pacman::p_load("patchwork", "tmap", "ExPanDaR", "kableExtra", "ggstatsplot", "plotly", "DT", "tidyverse")EDA: Overview of Suicide Rate in a Particular Year
Step-by-Step Data Preparation
1. Installing and launching required R packages
2. Loading the data
suicidedata_eda_formap <- read_csv("data/suicidedata_eda_formap.csv", show_col_types = FALSE)suicidedata_eda <- read_csv("data/suicidedata_eda.csv", show_col_types = FALSE)3. Dataset overview
We will check the suicidedata_eda as suicidedata_eda_formap is meant to be combined with the sf file (world)
3.1 Missing Values
missing.values <- suicidedata_eda %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
missing.values %>% kable()| key | num.missing |
|---|---|
| DN | 18360 |
| DR | 18360 |
| POP | 18360 |
| SN | 18360 |
| SP | 18360 |
Visualising Missing Values
missing_values <- function(vari = "age_name"){
prepare_missing_values_graph(suicidedata_eda, ts_id = vari)
}By country
missing_values("country")
By gender
missing_values("sex_name")
By age group
missing_values("age_name")
By year
missing_values("year")
Conclusion : The missing values are because of age-standardised (only for Suicide Rates)
3.2 Descriptive statistics
descr_stats <- function(gender = "Both", age = "All ages") {
descr <- prepare_descriptive_table(suicidedata_eda %>%
filter(sex_name == gender,
age_name == age) %>%
select(!c(6,8)) %>%
rename("Number of suicide" = "SN",
"Number of deaths" = "DN",
"Share of deaths from suicide (%)" = "SP",
"Suicide rate" = "SR",
"Mortality rate" = "DR"))
descr$kable_ret %>%
kable_styling("condensed", full_width = F, position = "center")
}descr_stats("Both", "All ages")| N | Mean | Std. dev. | Min. | 25 % | Median | 75 % | Max. | |
|---|---|---|---|---|---|---|---|---|
| Number of suicide | 6,120 | 3,874.828 | 18,425.617 | 0.126 | 93.934 | 532.835 | 1,882.305 | 220,356.720 |
| Number of deaths | 6,120 | 252,265.821 | 920,824.636 | 10.886 | 7,358.955 | 51,487.069 | 169,570.300 | 10,659,491.040 |
| Share of deaths from suicide (%) | 6,120 | 1.394 | 1.067 | 0.105 | 0.691 | 1.118 | 1.787 | 11.812 |
| Suicide rate | 6,120 | 11.339 | 9.395 | 1.427 | 5.424 | 8.224 | 14.601 | 95.565 |
| Mortality rate | 6,120 | 847.052 | 354.531 | 148.258 | 611.956 | 783.986 | 1,019.318 | 10,705.808 |
3.3 Distribution
3.3.1 Histogram of multiple variables
plot_multi_distribution <- function(gender = "Both", age = "All ages") {
suicidedata_hist <- suicidedata_eda %>%
filter(sex_name == gender,
age_name == age) %>%
select(c(7,9,10,11,12)) %>%
rename("Number of suicide" = "SN",
"Number of deaths" = "DN",
"Share of deaths from suicide (%)" = "SP",
"Suicide rate" = "SR",
"Mortality rate" = "DR")
plot <- suicidedata_hist %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap( ~key, ncol=3, scales="free") +
geom_histogram()
ggplotly(plot)
}plot_multi_distribution("Both", "All ages")It does not make sense to look at numbers due to different population sizes. Hence we will only look at Suicide Rate, Mortality Rate and Share of deaths from suicide (%) from now on
3.3.2 Histogram of individual variables
plot_distribution <- function(gender = "Both", age = "All ages", vari = SR, title = "Suicide Rate", binwidth = 2) {
suicidedata_hist <- suicidedata_eda %>%
filter(sex_name == gender,
age_name == age)
suicidedata_summary <- suicidedata_hist %>%
summarise(mean = round(mean({{vari}})),
median = round(median({{vari}})),
ymax = as.numeric(round((IQR({{vari}})*1.5) + quantile({{vari}},0.75))),
ymin = as.integer(min({{vari}})),
rangemax = as.integer(max({{vari}})))
#computing summary statistics of mean, median and lower and upper whiskers in boxplot
var_mean <- suicidedata_summary$mean
var_median <- suicidedata_summary$median
ymax <- suicidedata_summary$ymax
ymin <- suicidedata_summary$ymin
rangemax <- suicidedata_summary$rangemax
#plotting histogram
h1 <- ggplot(suicidedata_hist, aes(x = {{vari}})) +
geom_histogram(color="black", fill="azure4", binwidth = binwidth)
h <- h1 + scale_x_continuous(limits = c(0,rangemax), labels = scales::comma) +
labs(x = paste0(title,", ",gender,", ",age), y = "Count") +
geom_vline(aes(xintercept = var_mean), col="darkblue", linewidth=1, linetype = "dashed") +
annotate("text", x=var_mean*1.05, y=max(ggplot_build(h1)$data[[1]]$count)*1.2, label="Mean:",
size=4, color="darkblue", hjust = 0) +
annotate("text", x=var_mean*1.05, y=max(ggplot_build(h1)$data[[1]]$count)*1.1, label=format(var_mean, big.mark = ","),
size=4, color="darkblue", hjust = 0) +
geom_vline(aes(xintercept = var_median), col="lightpink4", linewidth=1, linetype = "dashed") +
annotate("text", x=var_median*0.95, y=max(ggplot_build(h1)$data[[1]]$count)*1.2, label="Median:",
size=4, color="lightpink4", hjust = 1) +
annotate("text", x=var_median*0.95, y=max(ggplot_build(h1)$data[[1]]$count)*1.1, label=format(var_median, big.mark = ","),
size=4, color="lightpink4", hjust = 1) +
theme(axis.text.x = element_text(size=8))
#plotting boxplot
b <- ggplot(suicidedata_hist, aes(y = {{vari}})) +
geom_boxplot(outlier.colour="firebrick", outlier.shape=16,
outlier.size=1, notch=FALSE) +
coord_flip() + labs(y = "", x = "") +
scale_y_continuous(limits = c(0,rangemax), labels = scales::comma) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
stat_boxplot(geom="errorbar", width=0.5) +
annotate("text", x=0.35, y=ymax, label=format(ymax, big.mark = ","),
size=3, color="lightpink4") +
annotate("text", x=0.35, y=ymin, label=format(ymin, big.mark = ","),
size=3, color="lightpink4")
#combining plots
suicide_distri <- b / h + plot_layout(heights = c(1, 4))
suicide_distri
}plot_distribution("Both", "All ages", SR, "Suicide Rate", 2)
3.4 Outliers
outliers_table <- function(gender = "Both", age = "All ages", vari = "Suicide rate") {
outliers <- prepare_ext_obs_table(suicidedata_eda %>%
filter(sex_name == gender,
age_name == age) %>%
select(!c(2,3,4,5)) %>%
rename("Share of deaths from suicide (%)" = "SP",
"Suicide rate" = "SR",
"Mortality rate" = "DR"),
n = 10,
cs_id = "country",
ts_id = "year",
var = vari)
outliers$kable_ret %>%
kable_styling("condensed", full_width = F, position = "center")
}outliers_table("Both", "All ages", "Suicide rate")| country | year | Suicide rate |
|---|---|---|
| Greenland | 1,993 | 95.565 |
| Greenland | 1,994 | 94.916 |
| Greenland | 1,995 | 94.541 |
| Greenland | 1,990 | 93.745 |
| Greenland | 1,996 | 93.518 |
| Greenland | 1,992 | 93.423 |
| Greenland | 1,991 | 93.390 |
| Greenland | 1,997 | 92.778 |
| Greenland | 1,998 | 90.744 |
| Greenland | 1,999 | 88.223 |
| ... | ... | ... |
| Jamaica | 1,993 | 1.586 |
| Sao Tome and Principe | 1,994 | 1.527 |
| Jamaica | 1,992 | 1.526 |
| Sao Tome and Principe | 1,995 | 1.514 |
| Sao Tome and Principe | 1,993 | 1.514 |
| Sao Tome and Principe | 1,992 | 1.467 |
| Sao Tome and Principe | 1,991 | 1.446 |
| Jamaica | 1,990 | 1.439 |
| Jamaica | 1,991 | 1.431 |
| Sao Tome and Principe | 1,990 | 1.427 |
More outliers on the high-end as compared to the other side with Greenland dominating the outliers. There might be interesting pattern here, hence windsorising the data using treat_outliers() function might not be recommended
4. Comparing between countries and regions within a particular year
4.1 Plotting the choropleth map
4.1.1 Joining with world map (tmap object)
The object World is a spatial object of class sf from the sf package; it is a data.frame with a special column that contains a geometry for each row, in this case polygons
Reference - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html
data("World")Joining the two dataframes together
suicidedata_eda_map <- left_join(World,
suicidedata_eda_formap %>% mutate(across(where(is.numeric), round, 2)),
by = c("iso_a3" = "code")) %>%
select(!c(2,3,4,6,7,8,9,10,11,12,13,14,15)) %>%
mutate(area = as.numeric(str_remove(`area`,
" \\[km\\^2\\]")),
.after = region) %>%
na.omit()4.1.2 Creating function to plot choropleth map
plot_map_eda <- function(year, metric = "SR", gender = "T", style = "jenks"){
metric_text = case_when(metric == "SR" ~ "Suicide rate",
metric == "SP" ~ "Share of deaths from suicide (%)",
metric == "SN" ~ "Number of suicide",
metric == "DN" ~ "Number of deaths",
metric == "DR" ~ "Mortality rate")
age = case_when(metric == "SR" ~ "AS",
metric == "SP" ~ "All",
metric == "SN" ~ "All",
metric == "DN" ~ "All",
metric == "DR" ~ "All")
gender_text = case_when(gender == "T" ~ "Total",
gender == "M" ~ "Male",
gender == "F" ~ "Female")
tmap_mode("view")
tm_shape(suicidedata_eda_map |>
filter(year == year))+
tm_fill(paste0(metric,"_",age,"_",gender),
style = style,
palette="YlOrBr",
id = "country",
title = paste0(metric_text, ", ",gender_text,", ", year),
popup.vars = c(value = paste0(metric,"_",age,"_",gender))) +
tm_borders(col = "grey20",
alpha = 0.5)
}plot_map_eda(2016, "SR", "M", "jenks")